library(tidyverse)
## ── Attaching packages ────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.2.1 ✔ purrr 0.3.3
## ✔ tibble 2.1.3 ✔ dplyr 0.8.4
## ✔ tidyr 1.0.0 ✔ stringr 1.4.0
## ✔ readr 1.3.1 ✔ forcats 0.5.0
## ── Conflicts ───────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
# Show the contents of the shared input directory.
list.files("../input")
## character(0)
# Load the training and test sets from their CSV exports.
train <- read.csv("~/Desktop/HTrainW19Final.csv")
test <- read.csv("~/Desktop/HTestW19Final No Y values.csv")
# Preview the first rows of the training data.
head(train)
## Ob MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour
## 1 1 50 RL 82 12375 Pave <NA> Reg Lvl
## 2 2 30 RM 60 10800 Pave Grvl Reg Lvl
## 3 3 45 RM 58 6380 Pave <NA> Reg Lvl
## 4 4 20 RL 60 7200 Pave <NA> Reg Lvl
## 5 5 60 FV 100 13162 Pave <NA> Reg Lvl
## 6 6 80 RL 61 9734 Pave <NA> IR1 Lvl
## Utilities LotConfig LandSlope Neighborhood Condition1 Condition2 BldgType
## 1 AllPub Inside Gtl Sawyer Feedr Norm 1Fam
## 2 AllPub Inside Gtl OldTown Norm Norm 1Fam
## 3 AllPub Inside Gtl BrkSide Norm Norm 1Fam
## 4 AllPub Inside Gtl NAmes Norm Norm 1Fam
## 5 AllPub Corner Gtl Somerst Feedr Norm 1Fam
## 6 AllPub Inside Gtl Gilbert RRAn Norm 1Fam
## HouseStyle OverallQual OverallCond YearBuilt YearRemodAdd RoofStyle RoofMatl
## 1 1.5Fin 5 5 1951 1951 Gable CompShg
## 2 1Story 4 7 1885 1995 Mansard CompShg
## 3 1.5Unf 5 6 1922 1950 Gable CompShg
## 4 1Story 5 8 1950 2002 Gable CompShg
## 5 2Story 9 5 2006 2006 Gable CompShg
## 6 SLvl 7 5 2004 2004 Gable CompShg
## Exterior1st Exterior2nd MasVnrType MasVnrArea ExterQual ExterCond Foundation
## 1 HdBoard HdBoard Stone 41 TA Fa CBlock
## 2 VinylSd VinylSd None 0 TA TA BrkTil
## 3 MetalSd MetalSd None 0 TA TA BrkTil
## 4 VinylSd VinylSd None 0 TA TA CBlock
## 5 VinylSd VinylSd None 0 Gd TA PConc
## 6 VinylSd VinylSd None 0 Gd TA PConc
## BsmtQual BsmtCond BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2
## 1 TA TA No BLQ 329 Unf
## 2 Fa TA No Unf 0 Unf
## 3 TA Fa No Unf 0 Unf
## 4 TA TA No ALQ 398 BLQ
## 5 Ex TA No GLQ 1836 Unf
## 6 Gd TA Mn GLQ 241 Rec
## BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating HeatingQC CentralAir Electrical
## 1 0 477 806 GasA TA Y SBrkr
## 2 0 641 641 GasA Gd Y SBrkr
## 3 0 993 993 GasA TA Y FuseA
## 4 149 317 864 GasA Gd Y SBrkr
## 5 0 200 2036 GasA Ex Y SBrkr
## 6 113 30 384 GasA Ex Y SBrkr
## X1stFlrSF X2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath FullBath
## 1 1081 341 0 1422 1 0 1
## 2 1047 0 0 1047 0 0 1
## 3 1048 0 0 1048 0 0 1
## 4 864 0 0 864 1 0 1
## 5 2036 604 0 2640 1 0 3
## 6 744 630 0 1374 0 0 2
## HalfBath BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd Functional
## 1 0 3 1 TA 7 Typ
## 2 0 2 1 TA 6 Typ
## 3 0 2 1 TA 5 Typ
## 4 0 3 1 Gd 5 Typ
## 5 1 3 1 Ex 11 Typ
## 6 1 3 1 Gd 7 Typ
## Fireplaces FireplaceQu GarageType GarageYrBlt GarageFinish GarageCars
## 1 1 TA Detchd 1951 Unf 1
## 2 0 <NA> Detchd 1954 Unf 1
## 3 1 Gd Detchd 1922 Unf 1
## 4 0 <NA> Detchd 1980 RFn 2
## 5 1 Gd Attchd 2006 RFn 3
## 6 0 <NA> BuiltIn 2004 Fin 2
## GarageArea GarageQual GarageCond PavedDrive WoodDeckSF OpenPorchSF
## 1 288 TA TA Y 0 0
## 2 273 Fa Fa N 0 0
## 3 280 TA TA Y 0 0
## 4 720 TA TA Y 194 0
## 5 792 TA TA Y 0 265
## 6 400 TA TA Y 0 0
## EnclosedPorch X3SsnPorch ScreenPorch PoolArea PoolQC Fence MiscFeature
## 1 0 0 0 0 <NA> GdWo <NA>
## 2 0 0 0 0 <NA> <NA> Shed
## 3 116 0 0 0 <NA> <NA> <NA>
## 4 0 0 0 0 <NA> <NA> <NA>
## 5 0 0 0 0 <NA> <NA> <NA>
## 6 0 0 0 0 <NA> <NA> <NA>
## MiscVal MoSold YrSold SaleType SaleCondition SalePrice
## 1 0 6 2006 WD Normal 131878.1
## 2 450 8 2007 WD Normal 104069.3
## 3 0 8 2006 WD Normal 116843.7
## 4 0 7 2007 WD Normal 132932.2
## 5 0 11 2006 New Partial 429077.4
## 6 0 5 2009 WD Normal 174735.6
# Preview the first rows of the test data.
head(test)
## Ob MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour
## 1 1 20 RL 53 3710 Pave <NA> Reg Lvl
## 2 2 60 RL NA 10304 Pave <NA> IR1 Lvl
## 3 3 75 RM 90 8100 Pave <NA> Reg Lvl
## 4 4 20 RL 80 14680 Pave Grvl IR1 HLS
## 5 5 90 RM 110 8472 Grvl <NA> IR2 Bnk
## 6 6 20 FV 72 8640 Pave <NA> Reg Lvl
## Utilities LotConfig LandSlope Neighborhood Condition1 Condition2 BldgType
## 1 AllPub Inside Gtl Blmngtn Norm Norm 1Fam
## 2 AllPub CulDSac Gtl NWAmes PosN Norm 1Fam
## 3 AllPub Corner Gtl OldTown Norm Norm 1Fam
## 4 AllPub Inside Gtl Crawfor Norm Norm 1Fam
## 5 AllPub Corner Mod IDOTRR RRNn Norm Duplex
## 6 AllPub Inside Gtl Somerst Norm Norm 1Fam
## HouseStyle OverallQual OverallCond YearBuilt YearRemodAdd RoofStyle RoofMatl
## 1 1Story 7 5 2007 2008 Gable CompShg
## 2 2Story 5 7 1976 1976 Gable CompShg
## 3 2.5Unf 5 5 1898 1965 Hip CompShg
## 4 1Story 5 4 1960 1960 Gable CompShg
## 5 1Story 5 5 1963 1963 Gable CompShg
## 6 1Story 8 5 2007 2008 Gable CompShg
## Exterior1st Exterior2nd MasVnrType MasVnrArea ExterQual ExterCond Foundation
## 1 WdShing Wd Shng BrkFace 20 Gd TA PConc
## 2 Plywood Plywood BrkFace 44 TA Gd CBlock
## 3 AsbShng AsbShng None 0 TA TA PConc
## 4 MetalSd MetalSd None 0 TA TA CBlock
## 5 Wd Sdng Wd Sdng None 0 Fa TA CBlock
## 6 VinylSd VinylSd None 0 Gd TA PConc
## BsmtQual BsmtCond BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2
## 1 Gd TA Gd Unf 0 Unf
## 2 TA TA No ALQ 381 Unf
## 3 TA TA No Unf 0 Unf
## 4 TA TA No Rec 793 Unf
## 5 Gd TA Gd LwQ 104 GLQ
## 6 Gd TA No GLQ 24 Unf
## BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating HeatingQC CentralAir Electrical
## 1 0 1146 1146 GasA Ex Y SBrkr
## 2 0 399 780 GasA Ex Y SBrkr
## 3 0 849 849 GasA TA N FuseA
## 4 0 480 1273 GasA Ex Y SBrkr
## 5 712 0 816 GasA TA N SBrkr
## 6 0 1339 1363 GasA Ex Y SBrkr
## X1stFlrSF X2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath FullBath
## 1 1246 0 0 1246 0 0 2
## 2 1088 780 0 1868 1 0 2
## 3 1075 1063 0 2138 0 0 2
## 4 1273 0 0 1273 0 0 1
## 5 816 0 0 816 1 0 1
## 6 1372 0 0 1372 0 0 2
## HalfBath BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd Functional
## 1 0 2 1 Gd 5 Typ
## 2 1 4 1 Gd 9 Typ
## 3 0 2 3 TA 11 Typ
## 4 0 2 1 TA 5 Typ
## 5 0 2 1 TA 5 Typ
## 6 0 3 1 Gd 6 Typ
## Fireplaces FireplaceQu GarageType GarageYrBlt GarageFinish GarageCars
## 1 1 Gd Attchd 2007 Fin 2
## 2 1 TA Attchd 1976 Unf 2
## 3 0 <NA> Detchd 1910 Unf 2
## 4 0 <NA> Attchd 1960 Unf 1
## 5 0 <NA> CarPort 1963 Unf 2
## 6 0 <NA> Attchd 2008 RFn 2
## GarageArea GarageQual GarageCond PavedDrive WoodDeckSF OpenPorchSF
## 1 428 TA TA Y 100 24
## 2 484 TA TA Y 448 96
## 3 360 Fa Po N 40 156
## 4 307 TA TA Y 483 0
## 5 516 TA TA Y 106 0
## 6 588 TA TA Y 192 113
## EnclosedPorch X3SsnPorch ScreenPorch PoolArea PoolQC Fence MiscFeature
## 1 0 0 0 0 <NA> <NA> <NA>
## 2 0 0 0 0 <NA> <NA> <NA>
## 3 0 0 0 0 <NA> MnPrv <NA>
## 4 0 0 115 0 <NA> MnPrv <NA>
## 5 0 0 0 0 <NA> <NA> <NA>
## 6 0 0 0 0 <NA> <NA> <NA>
## MiscVal MoSold YrSold SaleType SaleCondition
## 1 0 3 2008 New Partial
## 2 0 10 2009 WD Normal
## 3 0 11 2009 WD Normal
## 4 0 6 2009 WD Normal
## 5 0 5 2010 WD Normal
## 6 0 7 2008 New Partial
# Report the number of rows and columns in each data set.
dim(train)
## [1] 2500 81
dim(test)
## [1] 1500 80
# Count the missing values in every column of the training data.
colSums(is.na(train))
## Ob MSSubClass MSZoning LotFrontage LotArea
## 0 0 4 394 0
## Street Alley LotShape LandContour Utilities
## 0 2323 0 0 2
## LotConfig LandSlope Neighborhood Condition1 Condition2
## 0 0 0 0 0
## BldgType HouseStyle OverallQual OverallCond YearBuilt
## 0 0 0 0 0
## YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd
## 0 0 0 0 0
## MasVnrType MasVnrArea ExterQual ExterCond Foundation
## 19 16 0 0 0
## BsmtQual BsmtCond BsmtExposure BsmtFinType1 BsmtFinSF1
## 71 70 70 68 1
## BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating
## 69 1 1 1 0
## HeatingQC CentralAir Electrical X1stFlrSF X2ndFlrSF
## 0 0 0 0 0
## LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath FullBath
## 0 0 3 3 0
## HalfBath BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd
## 0 0 0 1 0
## Functional Fireplaces FireplaceQu GarageType GarageYrBlt
## 1 0 1225 134 134
## GarageFinish GarageCars GarageArea GarageQual GarageCond
## 134 0 0 134 134
## PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch X3SsnPorch
## 0 0 0 0 0
## ScreenPorch PoolArea PoolQC Fence MiscFeature
## 0 0 2491 2018 2421
## MiscVal MoSold YrSold SaleType SaleCondition
## 0 0 0 1 0
## SalePrice
## 0
# Handle missing values in the training data.
#
# Alley, PoolQC, Fence and MiscFeature are dropped because almost all of their
# values are missing (2323, 2491, 2018 and 2421 NAs out of 2500 rows).
# Columns are selected by exact name rather than by position, so this step
# does not depend on the column order of the CSV.
drop_cols <- c("Alley", "PoolQC", "Fence", "MiscFeature")
train2 <- train[, !(names(train) %in% drop_cols), drop = FALSE]

# Numeric columns whose NAs are replaced by the column mean.
mean_cols <- c("LotFrontage", "MasVnrArea")
for (col in mean_cols) {
  train2[[col]][is.na(train2[[col]])] <- mean(train2[[col]], na.rm = TRUE)
}

# Numeric columns whose NAs are replaced by the column median.
median_cols <- c(
  "GarageYrBlt", "BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF", "TotalBsmtSF",
  "BsmtFullBath", "BsmtHalfBath"
)
for (col in median_cols) {
  train2[[col]][is.na(train2[[col]])] <- median(train2[[col]], na.rm = TRUE)
}

# Categorical columns whose NAs are replaced by the most frequent level.
# table() ignores NAs by default, and names(counts)[which.max(counts)] is the
# label of the largest count. Indexing the column itself with which.max()
# would pick an arbitrary row instead of the modal level.
mode_cols <- c(
  "MSZoning", "Utilities", "MasVnrType", "BsmtQual", "BsmtCond",
  "BsmtExposure", "BsmtFinType1", "BsmtFinType2", "KitchenQual",
  "GarageCond", "GarageQual", "SaleType", "Functional", "FireplaceQu",
  "GarageType", "GarageFinish"
)
for (col in mode_cols) {
  counts <- table(train2[[col]])
  train2[[col]][is.na(train2[[col]])] <- names(counts)[which.max(counts)]
}

# Confirm that no column has missing values left.
colSums(is.na(train2))
## Ob MSSubClass MSZoning LotFrontage LotArea
## 0 0 0 0 0
## Street LotShape LandContour Utilities LotConfig
## 0 0 0 0 0
## LandSlope Neighborhood Condition1 Condition2 BldgType
## 0 0 0 0 0
## HouseStyle OverallQual OverallCond YearBuilt YearRemodAdd
## 0 0 0 0 0
## RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType
## 0 0 0 0 0
## MasVnrArea ExterQual ExterCond Foundation BsmtQual
## 0 0 0 0 0
## BsmtCond BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2
## 0 0 0 0 0
## BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating HeatingQC
## 0 0 0 0 0
## CentralAir Electrical X1stFlrSF X2ndFlrSF LowQualFinSF
## 0 0 0 0 0
## GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath
## 0 0 0 0 0
## BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd Functional
## 0 0 0 0 0
## Fireplaces FireplaceQu GarageType GarageYrBlt GarageFinish
## 0 0 0 0 0
## GarageCars GarageArea GarageQual GarageCond PavedDrive
## 0 0 0 0 0
## WoodDeckSF OpenPorchSF EnclosedPorch X3SsnPorch ScreenPorch
## 0 0 0 0 0
## PoolArea MiscVal MoSold YrSold SaleType
## 0 0 0 0 0
## SaleCondition SalePrice
## 0 0
# Total number of missing cells left in the cleaned training data.
sum(is.na(train2))
## [1] 0
# Handle missing values in the test data.
#
# The same sparse columns as in the training data are dropped, selected by
# exact name rather than by position. FireplaceQu is dropped here as well, so
# it needs no imputation; the linear model fitted later in this file does not
# use it.
drop_cols <- c("Alley", "PoolQC", "Fence", "MiscFeature", "FireplaceQu")
test2 <- test[, !(names(test) %in% drop_cols), drop = FALSE]

# Numeric columns whose NAs are replaced by the column mean.
mean_cols <- c("LotFrontage", "MasVnrArea", "GarageCars", "GarageArea")
for (col in mean_cols) {
  test2[[col]][is.na(test2[[col]])] <- mean(test2[[col]], na.rm = TRUE)
}

# Numeric columns whose NAs are replaced by the column median.
median_cols <- c(
  "BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF", "TotalBsmtSF", "GarageYrBlt",
  "BsmtFullBath", "BsmtHalfBath"
)
for (col in median_cols) {
  test2[[col]][is.na(test2[[col]])] <- median(test2[[col]], na.rm = TRUE)
}

# Categorical columns whose NAs are replaced by the most frequent level.
# Each column is indexed by its own missing rows, and the fill value is the
# label of the largest count in table(), which ignores NAs by default.
mode_cols <- c(
  "MSZoning", "Exterior1st", "Exterior2nd", "MasVnrType", "BsmtQual",
  "BsmtCond", "BsmtExposure", "BsmtFinType1", "BsmtFinType2", "Electrical",
  "Functional", "GarageType", "GarageFinish", "KitchenQual", "GarageCond",
  "GarageQual"
)
for (col in mode_cols) {
  counts <- table(test2[[col]])
  test2[[col]][is.na(test2[[col]])] <- names(counts)[which.max(counts)]
}

# Confirm that no column has missing values left.
colSums(is.na(test2))
## Ob MSSubClass MSZoning LotFrontage LotArea
## 0 0 0 0 0
## Street LotShape LandContour Utilities LotConfig
## 0 0 0 0 0
## LandSlope Neighborhood Condition1 Condition2 BldgType
## 0 0 0 0 0
## HouseStyle OverallQual OverallCond YearBuilt YearRemodAdd
## 0 0 0 0 0
## RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType
## 0 0 0 0 0
## MasVnrArea ExterQual ExterCond Foundation BsmtQual
## 0 0 0 0 0
## BsmtCond BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2
## 0 0 0 0 0
## BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating HeatingQC
## 0 0 0 0 0
## CentralAir Electrical X1stFlrSF X2ndFlrSF LowQualFinSF
## 0 0 0 0 0
## GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath
## 0 0 0 0 0
## BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd Functional
## 0 0 0 0 0
## Fireplaces GarageType GarageYrBlt GarageFinish GarageCars
## 0 0 0 0 0
## GarageArea GarageQual GarageCond PavedDrive WoodDeckSF
## 0 0 0 0 0
## OpenPorchSF EnclosedPorch X3SsnPorch ScreenPorch PoolArea
## 0 0 0 0 0
## MiscVal MoSold YrSold SaleType SaleCondition
## 0 0 0 0 0
#' Replace Tukey-fence outliers in a numeric vector with NA
#'
#' Values below Q1 - 1.5 * IQR or above Q3 + 1.5 * IQR are set to `NA`; all
#' other values are returned unchanged.
#'
#' @param x A numeric vector.
#' @param na.rm Passed to `quantile()`; when `TRUE`, missing values in `x` are
#'   ignored while the quartiles are computed.
#' @param ... Further arguments forwarded to `quantile()` (for example `type`).
#' @return A vector of the same length as `x` with outliers replaced by `NA`.
remove_outliers <- function(x, na.rm = TRUE, ...) {
  quartiles <- unname(quantile(x, probs = c(.25, .75), na.rm = na.rm, ...))
  # Derive the fence width from the same quartiles that anchor the fences, so
  # a non-default quantile `type` passed through `...` is applied consistently.
  fence <- 1.5 * (quartiles[2] - quartiles[1])
  y <- x
  y[x < (quartiles[1] - fence)] <- NA
  y[x > (quartiles[2] + fence)] <- NA
  y
}
# Remove outliers from the numeric training columns and refill the gaps.
#
# For each column, values outside the 1.5 * IQR fences are set to NA by
# remove_outliers() and then replaced by the median of the remaining values.
# One list drives both steps, so every column that has outliers removed is
# also refilled (KitchenAbvGr and YearRemodAdd included) and no NAs are left
# behind for lm() to drop.
# NOTE(review): a column whose quartiles coincide (IQR of 0) has every other
# value removed and refilled, so it becomes constant after this step; confirm
# that this is intended for sparse columns such as PoolArea or MiscVal.
outlier_cols <- c(
  "MSSubClass", "LotFrontage", "LotArea", "OverallQual", "OverallCond",
  "YearBuilt", "YearRemodAdd", "MasVnrArea", "BsmtFinSF1", "BsmtFinSF2",
  "BsmtUnfSF", "TotalBsmtSF", "X1stFlrSF", "X2ndFlrSF", "LowQualFinSF",
  "GrLivArea", "BsmtFullBath", "BsmtHalfBath", "FullBath", "BedroomAbvGr",
  "KitchenAbvGr", "TotRmsAbvGrd", "Fireplaces", "GarageYrBlt", "GarageCars",
  "GarageArea", "WoodDeckSF", "OpenPorchSF", "EnclosedPorch", "X3SsnPorch",
  "ScreenPorch", "PoolArea", "MiscVal"
)
for (col in outlier_cols) {
  cleaned <- remove_outliers(train2[[col]])
  cleaned[is.na(cleaned)] <- median(cleaned, na.rm = TRUE)
  train2[[col]] <- cleaned
}

# Confirm that no column has missing values left.
colSums(is.na(train2))
## Ob MSSubClass MSZoning LotFrontage LotArea
## 0 0 0 0 0
## Street LotShape LandContour Utilities LotConfig
## 0 0 0 0 0
## LandSlope Neighborhood Condition1 Condition2 BldgType
## 0 0 0 0 0
## HouseStyle OverallQual OverallCond YearBuilt YearRemodAdd
## 0 0 0 0 0
## RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType
## 0 0 0 0 0
## MasVnrArea ExterQual ExterCond Foundation BsmtQual
## 0 0 0 0 0
## BsmtCond BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2
## 0 0 0 0 0
## BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating HeatingQC
## 0 0 0 0 0
## CentralAir Electrical X1stFlrSF X2ndFlrSF LowQualFinSF
## 0 0 0 0 0
## GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath
## 0 0 0 0 0
## BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd Functional
## 0 126 0 0 0
## Fireplaces FireplaceQu GarageType GarageYrBlt GarageFinish
## 0 0 0 0 0
## GarageCars GarageArea GarageQual GarageCond PavedDrive
## 0 0 0 0 0
## WoodDeckSF OpenPorchSF EnclosedPorch X3SsnPorch ScreenPorch
## 0 0 0 0 0
## PoolArea MiscVal MoSold YrSold SaleType
## 0 0 0 0 0
## SaleCondition SalePrice
## 0 0
#' Replace Tukey-fence outliers in a numeric vector with NA
#'
#' Values below Q1 - 1.5 * IQR or above Q3 + 1.5 * IQR are set to `NA`; all
#' other values are returned unchanged.
#'
#' @param x A numeric vector.
#' @param na.rm Passed to `quantile()`; when `TRUE`, missing values in `x` are
#'   ignored while the quartiles are computed.
#' @param ... Further arguments forwarded to `quantile()` (for example `type`).
#' @return A vector of the same length as `x` with outliers replaced by `NA`.
remove_outliers <- function(x, na.rm = TRUE, ...) {
  quartiles <- unname(quantile(x, probs = c(.25, .75), na.rm = na.rm, ...))
  # Derive the fence width from the same quartiles that anchor the fences, so
  # a non-default quantile `type` passed through `...` is applied consistently.
  fence <- 1.5 * (quartiles[2] - quartiles[1])
  y <- x
  y[x < (quartiles[1] - fence)] <- NA
  y[x > (quartiles[2] + fence)] <- NA
  y
}
# Remove outliers from the numeric test columns and refill the gaps.
#
# For each column, values outside the 1.5 * IQR fences are set to NA by
# remove_outliers() and then replaced by the median of the remaining values,
# the same rule applied to the training data. One list drives both steps, so
# every column that has outliers removed is also refilled (YearRemodAdd,
# BsmtFullBath and KitchenAbvGr included).
# NOTE(review): a column whose quartiles coincide (IQR of 0) has every other
# value removed and refilled, so it becomes constant after this step; confirm
# that this is intended for sparse columns such as PoolArea or MiscVal.
outlier_cols <- c(
  "MSSubClass", "LotFrontage", "LotArea", "OverallQual", "OverallCond",
  "YearBuilt", "YearRemodAdd", "MasVnrArea", "BsmtFinSF1", "BsmtFinSF2",
  "BsmtUnfSF", "TotalBsmtSF", "X1stFlrSF", "X2ndFlrSF", "LowQualFinSF",
  "GrLivArea", "BsmtFullBath", "BsmtHalfBath", "FullBath", "BedroomAbvGr",
  "KitchenAbvGr", "TotRmsAbvGrd", "Fireplaces", "GarageYrBlt", "GarageCars",
  "GarageArea", "WoodDeckSF", "OpenPorchSF", "EnclosedPorch", "X3SsnPorch",
  "ScreenPorch", "PoolArea", "MiscVal"
)
for (col in outlier_cols) {
  cleaned <- remove_outliers(test2[[col]])
  cleaned[is.na(cleaned)] <- median(cleaned, na.rm = TRUE)
  test2[[col]] <- cleaned
}

# Confirm that no test column has missing values left.
colSums(is.na(test2))
## Ob MSSubClass MSZoning LotFrontage LotArea
## 0 0 0 0 0
## Street LotShape LandContour Utilities LotConfig
## 0 0 0 0 0
## LandSlope Neighborhood Condition1 Condition2 BldgType
## 0 0 0 0 0
## HouseStyle OverallQual OverallCond YearBuilt YearRemodAdd
## 0 0 0 0 0
## RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType
## 0 0 0 0 0
## MasVnrArea ExterQual ExterCond Foundation BsmtQual
## 0 0 0 0 0
## BsmtCond BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2
## 0 0 0 0 0
## BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating HeatingQC
## 0 0 0 0 0
## CentralAir Electrical X1stFlrSF X2ndFlrSF LowQualFinSF
## 0 0 0 0 0
## GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath
## 0 0 0 0 0
## BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd Functional
## 0 126 0 0 0
## Fireplaces FireplaceQu GarageType GarageYrBlt GarageFinish
## 0 0 0 0 0
## GarageCars GarageArea GarageQual GarageCond PavedDrive
## 0 0 0 0 0
## WoodDeckSF OpenPorchSF EnclosedPorch X3SsnPorch ScreenPorch
## 0 0 0 0 0
## PoolArea MiscVal MoSold YrSold SaleType
## 0 0 0 0 0
## SaleCondition SalePrice
## 0 0
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:purrr':
##
## some
# Baseline fit: regress SalePrice on the candidate predictors. All of them are
# numeric except MSZoning, which enters as a categorical term.
model <- lm(
  SalePrice ~ MSZoning + LotFrontage + LotArea + OverallQual + OverallCond +
    YearBuilt + YearRemodAdd + MasVnrArea + BsmtFinSF1 + BsmtFinSF2 +
    BsmtUnfSF + TotalBsmtSF + X1stFlrSF + X2ndFlrSF + KitchenAbvGr +
    LowQualFinSF + GrLivArea + BsmtFullBath + BsmtHalfBath + FullBath +
    HalfBath + BedroomAbvGr + TotRmsAbvGrd + Fireplaces + GarageYrBlt +
    GarageArea + WoodDeckSF + OpenPorchSF + EnclosedPorch + X3SsnPorch +
    ScreenPorch + PoolArea + MiscVal + MoSold + YrSold,
  data = train2
)
summary(model)
##
## Call:
## lm(formula = SalePrice ~ MSZoning + LotFrontage + LotArea + OverallQual +
## OverallCond + YearBuilt + YearRemodAdd + MasVnrArea + BsmtFinSF1 +
## BsmtFinSF2 + BsmtUnfSF + TotalBsmtSF + X1stFlrSF + X2ndFlrSF +
## KitchenAbvGr + LowQualFinSF + GrLivArea + BsmtFullBath +
## BsmtHalfBath + FullBath + HalfBath + BedroomAbvGr + TotRmsAbvGrd +
## Fireplaces + GarageYrBlt + GarageArea + WoodDeckSF + OpenPorchSF +
## EnclosedPorch + X3SsnPorch + ScreenPorch + PoolArea + MiscVal +
## MoSold + YrSold, data = train2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -140209 -16660 -2997 11119 386127
##
## Coefficients: (9 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.923e+05 9.858e+05 0.499 0.617540
## MSZoningFV -1.530e+04 9.310e+03 -1.643 0.100496
## MSZoningRH -2.797e+04 1.243e+04 -2.250 0.024528 *
## MSZoningRL -1.607e+04 8.734e+03 -1.839 0.065974 .
## MSZoningRM -1.229e+04 8.732e+03 -1.407 0.159557
## LotFrontage 1.792e+02 5.220e+01 3.432 0.000609 ***
## LotArea 1.523e+00 2.632e-01 5.788 8.08e-09 ***
## OverallQual 2.320e+04 7.748e+02 29.942 < 2e-16 ***
## OverallCond 1.690e+03 9.223e+02 1.832 0.067090 .
## YearBuilt 1.488e+02 4.705e+01 3.162 0.001585 **
## YearRemodAdd 1.020e+02 4.561e+01 2.237 0.025392 *
## MasVnrArea -2.853e+01 6.561e+00 -4.348 1.43e-05 ***
## BsmtFinSF1 3.421e+01 2.716e+00 12.595 < 2e-16 ***
## BsmtFinSF2 NA NA NA NA
## BsmtUnfSF 6.420e+00 2.629e+00 2.442 0.014700 *
## TotalBsmtSF 1.196e+01 4.060e+00 2.947 0.003244 **
## X1stFlrSF 3.014e+01 4.511e+00 6.681 2.96e-11 ***
## X2ndFlrSF 1.645e+01 3.498e+00 4.704 2.70e-06 ***
## KitchenAbvGr NA NA NA NA
## LowQualFinSF NA NA NA NA
## GrLivArea 6.984e+00 3.334e+00 2.095 0.036297 *
## BsmtFullBath 3.464e+03 1.683e+03 2.059 0.039631 *
## BsmtHalfBath NA NA NA NA
## FullBath 4.293e+03 1.932e+03 2.221 0.026423 *
## HalfBath 5.375e+03 1.899e+03 2.831 0.004681 **
## BedroomAbvGr -4.121e+03 1.284e+03 -3.211 0.001342 **
## TotRmsAbvGrd 6.959e+03 8.105e+02 8.585 < 2e-16 ***
## Fireplaces 5.688e+03 1.257e+03 4.523 6.39e-06 ***
## GarageYrBlt 5.326e+01 4.942e+01 1.078 0.281285
## GarageArea 3.093e+01 4.666e+00 6.629 4.19e-11 ***
## WoodDeckSF 1.733e+01 6.390e+00 2.711 0.006748 **
## OpenPorchSF 3.599e+01 1.561e+01 2.306 0.021215 *
## EnclosedPorch NA NA NA NA
## X3SsnPorch NA NA NA NA
## ScreenPorch NA NA NA NA
## PoolArea NA NA NA NA
## MiscVal NA NA NA NA
## MoSold 7.022e+01 2.465e+02 0.285 0.775771
## YrSold -6.042e+02 4.889e+02 -1.236 0.216703
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 30770 on 2344 degrees of freedom
## (126 observations deleted due to missingness)
## Multiple R-squared: 0.8503, Adjusted R-squared: 0.8484
## F-statistic: 459 on 29 and 2344 DF, p-value: < 2.2e-16
# Keep the variables with significant p-values in the full model.
# NOTE(review): BsmtFinSF2, ScreenPorch, PoolArea and MiscVal had NA
# coefficients (singularities) in the full model's summary, so they have no
# p-value supporting their inclusion here -- confirm they are intended.
model.1 <- lm(
  SalePrice ~ LotArea + OverallQual + YearBuilt + YearRemodAdd + MasVnrArea +
    BsmtFinSF1 + BsmtFinSF2 + BsmtUnfSF + TotalBsmtSF + X1stFlrSF +
    X2ndFlrSF + BsmtFullBath + BedroomAbvGr + TotRmsAbvGrd + Fireplaces +
    GarageArea + ScreenPorch + PoolArea + MiscVal,
  data = train2
)
summary(model.1)
##
## Call:
## lm(formula = SalePrice ~ LotArea + OverallQual + YearBuilt +
## YearRemodAdd + MasVnrArea + BsmtFinSF1 + BsmtFinSF2 + BsmtUnfSF +
## TotalBsmtSF + X1stFlrSF + X2ndFlrSF + BsmtFullBath + BedroomAbvGr +
## TotRmsAbvGrd + Fireplaces + GarageArea + ScreenPorch + PoolArea +
## MiscVal, data = train2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -148337 -16198 -2534 11506 408221
##
## Coefficients: (4 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8.745e+05 7.886e+04 -11.089 < 2e-16 ***
## LotArea 1.897e+00 2.278e-01 8.329 < 2e-16 ***
## OverallQual 2.380e+04 7.421e+02 32.079 < 2e-16 ***
## YearBuilt 2.443e+02 3.197e+01 7.643 3.00e-14 ***
## YearRemodAdd 1.451e+02 4.205e+01 3.452 0.000567 ***
## MasVnrArea -2.560e+01 6.369e+00 -4.020 5.99e-05 ***
## BsmtFinSF1 3.433e+01 2.554e+00 13.446 < 2e-16 ***
## BsmtFinSF2 NA NA NA NA
## BsmtUnfSF 6.537e+00 2.384e+00 2.742 0.006152 **
## TotalBsmtSF 1.135e+01 3.822e+00 2.970 0.003002 **
## X1stFlrSF 3.466e+01 3.893e+00 8.903 < 2e-16 ***
## X2ndFlrSF 2.861e+01 2.376e+00 12.040 < 2e-16 ***
## BsmtFullBath 2.670e+03 1.584e+03 1.686 0.092005 .
## BedroomAbvGr -4.589e+03 1.203e+03 -3.816 0.000139 ***
## TotRmsAbvGrd 6.621e+03 6.954e+02 9.522 < 2e-16 ***
## Fireplaces 6.697e+03 1.183e+03 5.663 1.66e-08 ***
## GarageArea 3.129e+01 4.240e+00 7.381 2.13e-13 ***
## ScreenPorch NA NA NA NA
## PoolArea NA NA NA NA
## MiscVal NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 31210 on 2484 degrees of freedom
## Multiple R-squared: 0.842, Adjusted R-squared: 0.841
## F-statistic: 882.4 on 15 and 2484 DF, p-value: < 2.2e-16
# I still choose GrLivArea over X1stFlrSF + X2ndFlrSF.
# I keep FullBath (needs transformation) instead of BsmtFullBath.
# Of BedroomAbvGr and TotRmsAbvGrd, I keep only TotRmsAbvGrd.
dim(train2)
## [1] 2500 77
dim(test2)
## [1] 1500 75
# Combine YearBuilt and YearRemodAdd into one predictor:
# Age = YearRemodAdd - YearBuilt (the gap between the two years, not the
# number of years since construction).
# The column is assigned by name, so this step does not depend on how many
# columns the data frame has and cannot overwrite an unrelated column.
train2$Age <- train2$YearRemodAdd - train2$YearBuilt
test2$Age <- test2$YearRemodAdd - test2$YearBuilt
# Refit with Age replacing the two year variables, GrLivArea replacing the
# per-floor areas, and FullBath / TotRmsAbvGrd as the bath and room counts.
# (The stray doubled "+" before BsmtUnfSF was removed; the fitted terms are
# unchanged.)
model.2 <- lm(
  SalePrice ~ LotArea + OverallQual + Age + BsmtFinSF1 + BsmtUnfSF +
    TotalBsmtSF + GrLivArea + FullBath + TotRmsAbvGrd + Fireplaces +
    GarageArea,
  data = train2
)
summary(model.2)
##
## Call:
## lm(formula = SalePrice ~ LotArea + OverallQual + Age + BsmtFinSF1 +
## +BsmtUnfSF + TotalBsmtSF + GrLivArea + FullBath + TotRmsAbvGrd +
## Fireplaces + GarageArea, data = train2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -154461 -17852 -2071 11827 378416
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.335e+05 3.967e+03 -33.663 < 2e-16 ***
## LotArea 1.774e+00 2.298e-01 7.720 1.67e-14 ***
## OverallQual 2.743e+04 6.705e+02 40.912 < 2e-16 ***
## Age -5.688e+01 3.023e+01 -1.882 0.060 .
## BsmtFinSF1 3.413e+01 2.440e+00 13.990 < 2e-16 ***
## BsmtUnfSF 1.827e+00 2.413e+00 0.757 0.449
## TotalBsmtSF 2.073e+01 2.767e+00 7.492 9.33e-14 ***
## GrLivArea 1.946e+01 2.594e+00 7.503 8.64e-14 ***
## FullBath 7.804e+03 1.638e+03 4.764 2.01e-06 ***
## TotRmsAbvGrd 5.421e+03 6.818e+02 7.951 2.78e-15 ***
## Fireplaces 6.551e+03 1.198e+03 5.467 5.04e-08 ***
## GarageArea 3.923e+01 4.286e+00 9.151 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 32180 on 2488 degrees of freedom
## Multiple R-squared: 0.8317, Adjusted R-squared: 0.8309
## F-statistic: 1117 on 11 and 2488 DF, p-value: < 2.2e-16
vif(model.2)
## LotArea OverallQual Age BsmtFinSF1 BsmtUnfSF TotalBsmtSF
## 1.321707 2.194310 1.202336 2.715034 2.446185 2.467341
## GrLivArea FullBath TotRmsAbvGrd Fireplaces GarageArea
## 3.145987 1.891017 2.345328 1.362708 1.760802
# Refit without the three basement-area predictors (BsmtFinSF1, BsmtUnfSF,
# TotalBsmtSF).
# NOTE(review): the stated rule was "delete predictors with VIF greater than
# 5", but every VIF printed above is below 5 -- confirm the reason for
# dropping these terms.
model.3 <- lm(
  SalePrice ~ LotArea + OverallQual + Age + GrLivArea + FullBath +
    TotRmsAbvGrd + Fireplaces + GarageArea,
  data = train2
)
summary(model.3)
##
## Call:
## lm(formula = SalePrice ~ LotArea + OverallQual + Age + GrLivArea +
## FullBath + TotRmsAbvGrd + Fireplaces + GarageArea, data = train2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -152071 -21373 -3368 16388 392619
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.254e+05 4.333e+03 -28.932 < 2e-16 ***
## LotArea 2.763e+00 2.497e-01 11.065 < 2e-16 ***
## OverallQual 3.017e+04 7.141e+02 42.248 < 2e-16 ***
## Age -1.821e+02 3.305e+01 -5.510 3.96e-08 ***
## GrLivArea 2.138e+01 2.891e+00 7.398 1.89e-13 ***
## FullBath 6.734e+03 1.808e+03 3.724 0.000201 ***
## TotRmsAbvGrd 4.277e+03 7.571e+02 5.650 1.79e-08 ***
## Fireplaces 1.169e+04 1.310e+03 8.917 < 2e-16 ***
## GarageArea 5.755e+01 4.687e+00 12.279 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 35880 on 2491 degrees of freedom
## Multiple R-squared: 0.7905, Adjusted R-squared: 0.7898
## F-statistic: 1175 on 8 and 2491 DF, p-value: < 2.2e-16
vif(model.3)
## LotArea OverallQual Age GrLivArea FullBath TotRmsAbvGrd
## 1.256462 2.002431 1.156760 3.142608 1.853543 2.326601
## Fireplaces GarageArea
## 1.311142 1.693859
# Drop Fireplaces, a predictor I consider uninformative (a judgment call);
# R^2 is not affected much.
# The predictors in model.4 are the numerical predictors picked at this stage.
model.4 <- lm(
  SalePrice ~ LotArea + OverallQual + Age + GrLivArea + FullBath +
    TotRmsAbvGrd + GarageArea,
  data = train2
)
summary(model.4)
##
## Call:
## lm(formula = SalePrice ~ LotArea + OverallQual + Age + GrLivArea +
## FullBath + TotRmsAbvGrd + GarageArea, data = train2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -149754 -21270 -3409 16881 406322
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.341e+05 4.287e+03 -31.286 < 2e-16 ***
## LotArea 3.029e+00 2.518e-01 12.029 < 2e-16 ***
## OverallQual 3.154e+04 7.084e+02 44.521 < 2e-16 ***
## Age -1.967e+02 3.353e+01 -5.866 5.05e-09 ***
## GrLivArea 2.648e+01 2.878e+00 9.199 < 2e-16 ***
## FullBath 4.983e+03 1.826e+03 2.730 0.00639 **
## TotRmsAbvGrd 4.346e+03 7.689e+02 5.652 1.76e-08 ***
## GarageArea 5.706e+01 4.760e+00 11.988 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 36440 on 2492 degrees of freedom
## Multiple R-squared: 0.7838, Adjusted R-squared: 0.7832
## F-statistic: 1291 on 7 and 2492 DF, p-value: < 2.2e-16
# Pick categorical variables: draw one boxplot of SalePrice per candidate
# grouping variable, in the order listed.
# Columns are resolved through `data = train2` instead of attach(train2), so
# nothing is left on the search path and no detached copy of train2 is read.
categorical_vars <- c(
  "Street", "MSSubClass", "LotShape", "LandContour", "Utilities",
  "LotConfig", "LandSlope", "Neighborhood", "Condition1", "Condition2",
  "BldgType", "HouseStyle", "RoofStyle", "RoofMatl", "Exterior1st",
  "Exterior2nd", "MasVnrType", "ExterQual", "ExterCond", "Foundation",
  "BsmtCond", "BsmtQual", "BsmtExposure", "BsmtFinType1", "BsmtFinType2",
  "Heating", "CentralAir", "Electrical", "KitchenQual", "Functional",
  "FireplaceQu", "GarageType", "GarageFinish", "GarageQual", "GarageCond",
  "PavedDrive", "SaleType"
)
for (cat_var in categorical_vars) {
  # Builds the formula SalePrice ~ <cat_var> from the column name.
  boxplot(reformulate(cat_var, response = "SalePrice"), data = train2)
}

# Untransformed fit on the numeric predictors (same formula as model.3).
# NOTE(review): the original label was "model with numerical and dummy", but
# no dummy/factor term appears in this formula -- confirm the intent.
model2 <- lm(
  SalePrice ~ LotArea + OverallQual + Age + GrLivArea + FullBath +
    TotRmsAbvGrd + Fireplaces + GarageArea,
  data = train2
)

# Transformation: model log(SalePrice) instead of SalePrice.
# tSalePrice is a standalone vector taken directly from train2; the model
# formulas refer to it by name.
tSalePrice <- log(train2$SalePrice)
model3 <- lm(
  tSalePrice ~ LotArea + OverallQual + Age + GrLivArea + FullBath +
    TotRmsAbvGrd + Fireplaces + GarageArea,
  data = train2
)
summary(model3)
##
## Call:
## lm(formula = tSalePrice ~ LotArea + OverallQual + Age + GrLivArea +
## FullBath + TotRmsAbvGrd + Fireplaces + GarageArea, data = train2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.91819 -0.07990 -0.00005 0.09415 0.66238
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.051e+01 1.855e-02 566.495 < 2e-16 ***
## LotArea 1.185e-05 1.069e-06 11.082 < 2e-16 ***
## OverallQual 1.426e-01 3.057e-03 46.652 < 2e-16 ***
## Age -1.448e-03 1.415e-04 -10.232 < 2e-16 ***
## GrLivArea 1.556e-04 1.238e-05 12.574 < 2e-16 ***
## FullBath 4.883e-02 7.742e-03 6.307 3.36e-10 ***
## TotRmsAbvGrd 1.224e-02 3.241e-03 3.776 0.000163 ***
## Fireplaces 5.900e-02 5.610e-03 10.516 < 2e-16 ***
## GarageArea 3.443e-04 2.007e-05 17.157 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1536 on 2491 degrees of freedom
## Multiple R-squared: 0.8468, Adjusted R-squared: 0.8463
## F-statistic: 1721 on 8 and 2491 DF, p-value: < 2.2e-16
# Add factor (categorical) versions of FullBath and GarageCars to both data
# frames. The columns are assigned by name, not by hard-coded position, so the
# step does not depend on the current column count and cannot overwrite an
# unrelated column.
# NOTE(review): the model4 formula uses the numeric FullBath and GarageCars,
# not these factor columns -- confirm whether they are still needed.
dim(train2)
## [1] 2500 78
train2$FullBathNew <- factor(train2$FullBath)
boxplot(train2$SalePrice ~ train2$FullBathNew)

train2$GarageCarsNew <- factor(train2$GarageCars)
boxplot(train2$SalePrice ~ train2$GarageCarsNew)

dim(train2)
## [1] 2500 80
dim(test2)
## [1] 1500 76
test2$FullBathNew <- factor(test2$FullBath)
test2$GarageCarsNew <- factor(test2$GarageCars)
dim(test2)
## [1] 1500 78
# Final model: the log-transformed response tSalePrice regressed on nine
# numeric predictors.
model4 <- lm(
  tSalePrice ~ LotArea + OverallQual + Age + GrLivArea + FullBath +
    TotRmsAbvGrd + Fireplaces + GarageCars + BsmtFinSF1,
  data = train2
)
summary(model4)
##
## Call:
## lm(formula = tSalePrice ~ LotArea + OverallQual + Age + GrLivArea +
## FullBath + TotRmsAbvGrd + Fireplaces + GarageCars + BsmtFinSF1,
## data = train2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.94774 -0.06624 0.00142 0.07934 0.68181
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.050e+01 1.700e-02 617.698 < 2e-16 ***
## LotArea 1.016e-05 9.737e-07 10.435 < 2e-16 ***
## OverallQual 1.341e-01 2.856e-03 46.955 < 2e-16 ***
## Age -9.926e-04 1.312e-04 -7.564 5.44e-14 ***
## GrLivArea 1.542e-04 1.127e-05 13.681 < 2e-16 ***
## FullBath 5.566e-02 7.147e-03 7.787 9.97e-15 ***
## TotRmsAbvGrd 1.618e-02 2.966e-03 5.454 5.41e-08 ***
## Fireplaces 3.154e-02 5.214e-03 6.050 1.66e-09 ***
## GarageCars 7.841e-02 5.248e-03 14.941 < 2e-16 ***
## BsmtFinSF1 1.685e-04 7.134e-06 23.617 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1403 on 2490 degrees of freedom
## Multiple R-squared: 0.8723, Adjusted R-squared: 0.8718
## F-statistic: 1890 on 9 and 2490 DF, p-value: < 2.2e-16
# Draw the diagnostic plots that plot() produces for the fitted lm object.
plot(model4)




# Predict on the test set. model4 was fitted to log(SalePrice), so exp() maps
# the predictions back to the original price scale.
prediction <- exp(predict(model4, newdata = test2))
prediction[1:10]
## 1 2 3 4 5 6 7 8
## 171266.1 175534.7 144937.6 142543.8 120112.1 207577.5 105137.8 169698.7
## 9 10
## 101879.8 110559.0
# Build the submission table. Ob numbers the predictions 1..n; it is derived
# from the prediction vector rather than a hard-coded 1500, so it always
# matches the number of predicted rows.
saleprice_prediction <- data.frame(
  Ob = seq_along(prediction),
  SalePrice = prediction
)
write.csv(
  saleprice_prediction,
  file = "Yuqing_Yang_101A-saleprice-predictions.csv",
  row.names = FALSE
)